
********************************************************************************
*
* This program takes the set of strongly connected establishments and prepares
* files to send to Matlab to estimate the Sorkin (QJE 2018) model.
*
* There are two output files: 
* connected_moves.csv
* params.csv  
*
********************************************************************************


* Build the number of workers per (skill-specific establishment id, year).
* Input : cleaned spell-level observations, occupation codes (BeH_large) and
*         the occupation -> high_skill mapping.
* Output: $temp/workers_currentid_year_skill_ver$ver.dta with one row per
*         currentid-year and the head-count in the variable workers.
use currentid persnr year spell using "$temp/cleaned_obs_reconfig_hockey_ver$ver.dta", clear
* currentid == 1 is a special code (presumably the non-employment state - TODO confirm)
drop if currentid == 1
drop if missing(spell)

* Attach the occupation code of each person-spell; unmatched spells are dropped
merge 1:1 persnr spell using "$temp/BeH_large", keep(3) nogen keepusing(beh_beruf_num)

* Attach the skill indicator of the occupation; occupations that are not in the
* skill file are kept (keep(1 3)) with a missing indicator
merge m:1 beh_beruf_num using "$temp/high_skill_cleaned_obs_all", keepusing(high_skill) nogen keep(1 3)
rename high_skill high_skill_current

* Keep only years strictly between $syear and $eyear
drop if year <= $syear 
drop if year >= $eyear

gen workers = 1

* Skill-specific establishment id: original id * 10 + skill indicator + 1.
* The full variable name is spelled out on purpose: after the rename above the
* short name high_skill only resolved through variable abbreviation.
* NOTE(review): rows with a missing skill indicator get a missing currentid and
* end up in a missing-id group after the collapse - confirm this is intended.
rename currentid betnr_orig
gen double currentid = betnr_orig * 10 + high_skill_current + 1

collapse (sum) workers, by(currentid year)
compress
save "$temp/workers_currentid_year_skill_ver$ver.dta", replace


*-------------------------------------------------------------------------------
* Full dataset
*-------------------------------------------------------------------------------

* Add the "belongs to the largest strongly connected set" indicator (computed
* in Matlab) to the establishment matrix id file and assign the model ids.
* Output: $temp/estab_matrix_ids_skill_ver$ver.dta

* Forward slashes are used in all paths: they work on every platform and
* cannot be misread as an escape character next to a macro.
import delimited using "$matlab/sconn_idfy/sconn_idfy_skill_ver$ver.csv", clear
rename v1 matrix_estabid
rename v2 in_strconn
merge 1:1 matrix_estabid using "$temp/estab_matrix_ids_orig_skill_ver$ver.dta"
* A few obs are lost due to the restrictions imposed in str_conn_input;
* everything that is not matched is flagged as outside the connected set
replace in_strconn = 0 if _merge != 3
drop _merge
* The id file may carry the establishment number under the name "a"; the
* rename is allowed to fail silently when betnr already exists
cap rename a betnr
format estabid %12.0g
recast double betnr
* Members of the strongly connected set come first, so that they occupy the
* model ids 1..K
gsort -in_strconn matrix_estabid
* Stored as long: the default float type represents integers exactly only up
* to 16,777,216, which would silently corrupt large running ids
gen long model_estabid = _n
assert !missing(betnr)
save "$temp/estab_matrix_ids_skill_ver$ver.dta", replace

* Calculate f_o for the strongly connected set: among the moves that start in
* currentid == 1, the share that goes to each receiving establishment.
* Output: $temp/fo_new_skill_ver$ver with one row per betnr and its f_o.
use "$temp/str_conn_input_skill_ver$ver", clear
drop if currentid != 1
gen double betnr = toid
merge m:1 betnr using "$temp/estab_matrix_ids_skill_ver$ver", keepus(in_strconn)
tab _merge in_strconn // the only element in strconn not merged is 1=N which doesnt hire from N

* Exactly one member of the strongly connected set may be missing as receiver
count if _merge == 2 & in_strconn == 1
assert r(N) == 1

keep if in_strconn==1
keep if _merge==3
drop _merge
gen byte flow = 1 
* The counts are stored as double: egen defaults to float, which is exact only
* up to 16,777,216 and would distort f_o once the number of moves exceeds that.
* total() is the documented name of the former sum() egen function.
bysort betnr: egen double M_in = total(flow)
egen double sum_nj = total(flow)
gen double f_o = M_in/sum_nj
* f_o is constant within betnr, so one row per establishment is sufficient
duplicates drop betnr, force 
keep betnr f_o
count
save "$temp/fo_new_skill_ver$ver", replace


* Reload the trimmed movers file
use "$temp/str_conn_input_skill_ver$ver", clear

* ---- Sender side --------------------------------------------------------------
* Look up the matrix id, the model id and the strongly connected flag of the
* sending establishment.
rename currentid estabid
merge m:1 estabid using "$temp/estab_matrix_ids_skill_ver$ver", ///
    keepusing(matrix_estabid model_estabid in_strconn)
* Keep matched moves only: this removes moves whose sender is absent from the
* id file as well as id-file establishments that never appear as a sender
drop if _merge != 3
drop _merge
rename (matrix_estabid model_estabid estabid in_strconn) ///
       (matrix_currentid model_currentid currentid sender_insconn)

* ---- Receiver side ------------------------------------------------------------
* Same lookup for the receiving establishment.
rename toid estabid
merge m:1 estabid using "$temp/estab_matrix_ids_skill_ver$ver", ///
    keepusing(matrix_estabid model_estabid in_strconn)
* Rows that exist only in the id file (establishments that never show up as a
* receiver) are kept; their sender is set to id 1 and flagged as connected
foreach v of varlist currentid matrix_currentid model_currentid sender_insconn {
    replace `v' = 1 if _merge == 2
}
drop _merge
rename (estabid matrix_estabid model_estabid in_strconn) ///
       (toid matrix_toid model_toid recvr_insconn)

* Check that the move matrix inside the strongly connected set is square:
* the largest model id on the sender side has to equal the largest model id
* on the receiver side.
count if sender_insconn == 1 & recvr_insconn == 1
foreach side in currentid toid {
    summarize model_`side' if sender_insconn == 1 & recvr_insconn == 1, detail
    local top_`side' = r(max)
}
assert `top_currentid' == `top_toid'


* Keep only moves for which both sender and receiver are strongly connected
generate strongly_connected = (sender_insconn == 1 & recvr_insconn == 1)
tabulate strongly_connected
drop if strongly_connected != 1

* Attach the offer distribution f_o (keyed by betnr in the f_o file) to the
* sending establishment of every move
rename currentid betnr
merge m:1 betnr using "$temp/fo_new_skill_ver$ver"
* Only id 1 is allowed to be unmatched
assert _merge == 3 | betnr == 1
tabulate _merge
drop _merge
rename betnr currentid
compress
save "$temp/model_input_new_skill_ver$ver", replace


* Prepare the moves for the csv export used by the model estimation in Matlab:
* attach the establishment-level EE/EN rates and the yearly head-counts.

* The rates file is keyed by the "real" establishment number, so
* betnr_current temporarily takes the name betnr for the merge.
* Rows that exist only in the rates file are not kept.
rename betnr_current betnr // using "real" betnr 
merge m:1 betnr year q using "$results/estab_eeenrates_byyr_ver$ver", ///
    keepusing(eerate_avgall enrate_avgall bin) keep(1 3) nogen
rename betnr betnr_current 

* Head-count per skill-specific establishment id and year. The path uses a
* forward slash, matching the statement that saved this file, so that it also
* resolves on non-Windows systems.
merge m:1 currentid year using "$temp/workers_currentid_year_skill_ver$ver", ///
    keep(1 3) nogen

* Relative size g of each establishment: its head-count summed over years,
* divided by the head-count summed over all establishment-years in the data.
rename currentid betnr 
* tag marks one row per establishment-year so that each head-count enters once
bysort betnr year: generate tag = (_n == 1)
bysort betnr: egen double tot_empl = total(workers) if tag == 1
* W: total number of workers ever employed in the strongly connected set
egen double W = total(tag * workers)
generate double g_temp = tot_empl / W
* g_temp is only filled on tagged rows; the mean spreads it to every row
bysort betnr: egen g = mean(g_temp)
drop g_temp tag workers
rename betnr currentid


* Keep the total worker count as a scalar before the variable is discarded
summarize W
scalar W = r(mean)

* i = receiving id, j = sending id
generate double i = toid
generate double j = currentid
drop currentid toid
keep i j g eerate_avgall enrate_avgall f_o bin year

* Moves that start in id 1 must not carry a bin; every missing bin is then set
* to a positive number so that all NE transitions are endogenous
assert bin == . if j == 1
replace bin = 0.5 if bin == .
order i j eerate_avgall enrate_avgall g f_o bin

* Recode id 1 to 2 on both sides
foreach v of varlist i j {
    replace `v' = 2 if `v' == 1
}

* Every sender other than the recoded id 2 needs complete rates and f_o
foreach v of varlist eerate_avgall enrate_avgall f_o {
    assert !missing(`v') if j != 2
}

* Add industry codes (to calculate delta and roh by sector) and export the
* moves to csv for Matlab.

* The sender id j was built as establishment number * 10 + skill term, so the
* integer part of j/10 recovers the establishment number used in wz.dta
gen double betnr = floor(j/10)
merge m:1 betnr using "$temp/wz.dta", keepusing(ind sector) 
drop if _merge == 2
drop _merge
drop betnr

* Consecutive sector numbers 1..max_ind for the observed industry codes
sort ind
egen sector_num = group(ind)
su sector_num
global max_ind = r(max)
* Two extra sectors collect the flows without an industry code
replace sector_num = $max_ind + 1 if missing(ind) & j != 2 // ee- or en-flows with missing industry
replace sector_num = $max_ind + 2 if missing(ind) & j == 2 // ne-flows with missing industry

drop sector ind year
rename sector_num sector

* Wide display format so that datafmt writes the ids with all their digits
format i %16.0g 
format j %16.0g

* Forward slashes keep the export path valid on every platform
export delimited using "$matlab/data_new/connected_moves_skill_ver$ver.csv", datafmt   replace
